############################################
############################################
# previous: 3-VAP_MaxentPrep ###############
############################################

###### Summary ############################################################################################
#This script summaries the results from the full maxent run with all variables for each modelling unit and determines which variables performed best,
#based on ranking the average permutation importance of each predictor variable across the 10 crossvalidated model runs. It then subsets the original 
#swd and background files and reruns models using only the best performing variables for each modelling unit. It also creates a summary file pf performance 
#of each variable across all species. The final variables used for each species are at least 5 even for species with few (less than 100) occurrence records
#but beyond that exclude any variables with a permutation importance below 1% and limit the number of variables to a maximum of 5% of the number of 
#occurrence records (e.g. a species with 200 occurrence records can not have more than 10 final predictor variables) - this is a commonly used standard to 
#avoid overfitting. Lastly the script creates shell files to run a 10-fold 70/30% training/test data cross-validated model for statistical verification of 
#results, as well as a final model using all occurrence records (without extrapolating beyond conditions used in predictors), and a map projection of predicted
#habitat suitability at current conditions using the original rasters used for the creation of swd files. It also asks maxent create a limiting factor map, 
#showing which variable most limits habitat suitability for each modelling unit in each grid cell.
###########################################################################################################

############################################
# next: 5-VAP_CurrentPlotsThresh ###########
############################################

#######################################################################################################
##### MaxEnt3 Variable Ranking & Selection ############################################################
#######################################################################################################

#load packages:
library(R.utils)
library(raster)
library(maptools)
library(rgdal)
library(stringr)
library(sf)



#######################################################################################################
#### the below settings may change depending on the project ###########################################
#######################################################################################################

#set project name:
projectName<-"WHOSnakes"

#set resolution in decimal degrees:
res<- 0.01

#number of crossvalidations to run in Maxent
crossvals<- 10

#define prefix that determines categorical variables for Maxent:
catVars<-"cat_" 

#should each species' records added to the original target 
#background before running the final model? "no" or "yes":
addtbg<-"no" 

#define selection criteria of interest for further analyses:
crits<-c("permutation.importance")   #c("permutation.importance","contribution","Test.gain.without","Test.gain.with.only","AUC.without","AUC.with.only")
critThresh<-c(1)                     #c(1,1,1,10,1,75)

#define layer directory to use for data extractions and/or projections:
LayersDir<-paste("/home/jc217070/Maxent/Layers")



#######################################################################################################
##### the below settings should all stay the same in most cases #######################################
#######################################################################################################

#define Maxent directories and subdirectories:
MyDir<-paste("/home/jc217070/Maxent",sep="")
# define location of maxent.jar file
maxent.jar <- paste(MyDir,"/maxent.jar",sep="")

#read species reference list:
DatSum<-read.table(file=file.path(paste(MyDir, "/Maxent_LUTables/",projectName,".csv",sep="")), header=TRUE, sep=",") #load data summary for all species
spp<-DatSum$gen_sp_subtax

FullSDMDir<-paste(MyDir,"/Outputs1_Full",sep="")
OutDirCrossval<-paste(MyDir,"/Outputs2_Crossval",sep="")
OutDirFinal<-paste(MyDir,"/Outputs3_Final",sep="")

SWDDir<-paste(MyDir, "/Maxent_SWDs/spp_swds",sep="")
BackgrDir<-paste(MyDir, "/Maxent_SWDs/backgrounds",sep="")
BGforVars<-paste(BackgrDir,"/ModUnit_",spp[1],"_BG.csv",sep="")

#create & define PBS directory for output files (errors and R outs)
dir.create(path=file.path(MyDir,"tmp.pbs"),showWarnings = FALSE)
PbsDir <- paste(MyDir, "/tmp.pbs", sep="")
setwd(PbsDir)


#save a modified copy of the base spreadsheet with final number of swd recs per species in it:
DatSum$SPswdrecs<-0
for (sp in spp){
  if(file.exists(paste(SWDDir, "/",sp,".csv",sep=""))){
  sprecs<-read.table(file=file.path(paste(SWDDir, "/",sp,".csv",sep="")), header=TRUE, sep=",") #load data summary for all species
  DatSum$SPswdrecs[DatSum$gen_sp_subtax==sp]<-length(sprecs[,1])
  Sys.sleep(0.1)
  print(paste(Sys.time(), sp, length(sprecs[,1]), sep=" "))
  }
}
write.table(x=DatSum, file=paste(MyDir, "/Maxent_LUTables/",projectName,"_Fin3b.csv",sep=""),   
            na = "NA", row.names = FALSE, col.names = TRUE, sep=",")

#Make empty data frames for results summary:
maxEntRes<-data.frame() #create empty data frame for summarized results for all species
spExists<-vector()

#get names of all variables used in full base models:
allVars<-read.table(file=paste(BGforVars), header=TRUE, sep=",") #load data summary for all species
allVars<-names(allVars)[4:length(allVars)] #names of all variables used across models



####################################################################
## general information for each species/ modelling unit ############
####################################################################

mapspp<-unique(DatSum$ModUnit)
#mapsppB<-unique(DatSum$ModUnit)
# mapsppB<-unique(DatSum$ModUnit [DatSum$MergeShort %in% c("Bitis arietans","Bitis gabonica group","Bitis nasicornis",
#                                                          "Cerastes cerastes complex","Echis coloratus super-complex",
#                                                          "Dendroaspis polylepis","Dendroaspis viridis group",
#                                                          "Naja-Afronaja complex","Naja-Uraeus complex",
#                                                          "Naja-Boulengerina annulata group","Naja-Boulengerina melanoleuca group",
#                                                          "Pseudohaje spp.","Thelotornis spp.","Dispholidus typus")])

#mapsppB<-unique(DatSum$ModUnit[DatSum$Priority==1 | DatSum$Priority==0])
mapsppB<-unique(DatSum$ModUnit[DatSum$Redo==1])
#mapsppB<-unique(DatSum$ModUnit[DatSum$Priority==0])
#mapsppB<-c("Walterinnesia aegyptia","Walterinnesia morgani")

#make tet file with qsub commands list for all species' shell files
qsub.list.file <- '/home/jc217070/Maxent/R_scripts/qsubs-4-VAP.txt'

for (sp in mapspp){
    if(sp %in% mapsppB){
      
    n<-which(DatSum$ModUnit==sp) #gets row number for current species
    sp<-gsub(" ","_",sp)
    SPSDMDir<-paste(FullSDMDir,"/ModUnit_", sp,sep="")
    
    
    
    
    
    
    if (file.exists(paste(SPSDMDir, "/", sp, ".html", sep=""))){ #safety, make sure again that output exists
    #if (file.exists(paste(SPSDMDir, "/", sp, ".html", sep="")) & 
    #    !(file.exists(paste(OutDirCrossval,"/permutation.importance/ModUnit_", sp, "/", sp, "_0.html", sep="")))){ #safety, make sure again that output exists
      
      
      
      
      
      #define species that actually have existing results:
      spExists<-c(spExists,sp)
      
      spRes<-read.csv(paste(SPSDMDir, "/maxentResults.csv", sep=""),
                      sep=",", header=TRUE)   #read in results for species crossvalid Maxent models
      sprecs<-read.csv(paste(SWDDir,"/ModUnit_",sp,".csv",sep=""), 
                       sep=",", header=TRUE)   #read in clean species occurrence records
      
      #define variables included in results table (these may be less than in full background):
      spvars<-gsub(".contribution","",grep("contribution$", names(spRes), value = TRUE))
      
      Test.gain<-spRes$Test.gain[length(spRes$Test.gain)]
      AUC<-spRes$Test.AUC[length(spRes$Test.AUC)]
      
      Sys.sleep(0.1)
      print(paste(Sys.time(), n, "performing variable selection for", sp, sep=" "))
      
      for (crit in crits){ #for each selection criterion (just permutation importance in this case)
        
        
        #############################################################
        ### read species results for variable selection criterion ###
        #############################################################
        
        j<-which(crits==crit) #get number of this criterion in criteria list
        colNos<-vector() #make empty vector for column numbers in results relevant for this criterion
        
        for (var in spvars){ #get col numbers for relevant variables results
          varColName<-ifelse(crit %in% c("permutation.importance","contribution"),
                             paste(var,".",crit,sep=""),
                             paste(crit,".",var,sep=""))
          i<-which(names(spRes)==paste(varColName))
          colNos<-c(colNos,i)
        }
        
        critVal<-as.numeric(spRes[length(spRes[,1]),colNos]) #get mean criterion values for all results (across 10 crossvalidated replicates)
        
        #For some criteria the values need to be converted
        if(crit %in% c("Test.gain.without","Test.gain.with.only","AUC.without","AUC.with.only")){
          total<-ifelse(crit %in% c("Test.gain.without","Test.gain.with.only"),Test.gain,AUC)
          critVal<-critVal/(total/100)
          if(crit %in% c("Test.gain.without","AUC.without")){
            critVal<-100-critVal
          }
        }
        
        spMeans<- data.frame(spvars,critVal)   #make data frame with variable names and their mean criterion values
        spMeans$rank<- rank(-spMeans$critVal)  #create vector of ranks (highest value=best rank) for the variables for this species according to the current criterion
        
        #these are the ranked variables with more than 1% perm imp 
        #(or whatever criterion and threshold was used):
        
        l<-length(spMeans[spMeans$critVal>=critThresh[j],1]) #number of variables with acceptable crit value
        
        spMeansRed<- spMeans[order(spMeans$rank), ] 
        row.names(spMeansRed)<-c(1:length(spMeansRed[,1]))
        
        
        
        ########################################
        ### adjust number of final variables####
        ########################################
        
        #if there's more than 5 variables that pass the threshold for being suitable predictors, continue with those for the final MaxEnt run, otherwise 
        #(if there's only 1 or 2 good predictors) use the best 5 irrespective of whether they're above the threshold or not
        if(l>=5){
          spMeansRed<-spMeansRed[spMeansRed$critVal>=critThresh[j],] 
        } else{
          spMeansRed<- spMeansRed[1:5,]
        }
        
        #further cut variable list to number of records for species/20 (ie a species with up 
        #to 200 records can only have a max of 10 predictor variabes)
        recNo<-length(sprecs[,1])
        
        spMeansRed<- spMeansRed[order(spMeansRed$rank), ] #redo sorting just in case
        
        if(recNo<=100){                        #if there's less than 100 records
          if(length(spMeansRed[,1])>5){       #...and if there is more than 5 relevant variables
            spMeansRed<-spMeansRed[1:5,]      #shorten the variable list to 5 (>60/20= >3)
            #any sp with more than 100 recs get x/20 variables, e.g. 100/20=5, 160/20=8, 200/20=10, 
          }} else {
            if (length(spMeansRed[,1])>recNo/20){   #if there is more relevant variables than the number of records divided by 20
              spMeansRed<-spMeansRed[1:(recNo/20),] #shorten the variable list to recNo/20
            }                                       #otherwise, i.e. if the list is already short enough, leave as is
          }
        
        
        
        ##############################################
        ### create summary table of all species results###
        ##############################################
        
        #write summary vector of this species results for this criterion and add as new row to overall results
        spSum<-c(as.character(sp),crit,spRes$X.Training.samples[length(spRes[,1])],spRes$Test.gain[length(spRes[,1])],spRes$Test.AUC[length(spRes[,1])],
                 spRes$X.Background.points[length(spRes[,1])],rep("NA",length(allVars)*2))
        spSum<-as.data.frame(t(spSum))
        names(spSum)<-c("species","criterion","TrainSamples","TestGain","TestAUC","BgPoints",
                        allVars,paste(allVars,"_rank",sep=""))
        
        
        
        
        
        
        
        #if this is not the first species from the list or if this is a redo of just some of the species 
        #(i.e. I don't want to recreate the results table but just substitute some species with updted values)...
        #read in previously saved table, delete existing data for this species and then add back in updated data
        
        if(!(1 %in% n) | length(mapsppB)!=length(mapspp)){
          maxEntRes<-read.csv(paste(FullSDMDir, "/",projectName,"_AllSppRes.csv",sep=""), sep=",", header=TRUE)
          maxEntRes<-maxEntRes[maxEntRes$species!=sp,]
        }
          
        maxEntRes<-maxEntRes[maxEntRes$species %in% mapspp] #delete rows for any species that are no longer in my list
        maxEntRes<-na.omit(maxEntRes)
        
        maxEntRes<-rbind(maxEntRes,spSum)
        maxEntRes<-data.frame(lapply(maxEntRes, as.character), stringsAsFactors=FALSE)
        
        #substitute results for variables used for this species in table (other stay "NA"):
        for (var in spMeans$spvars){
          maxEntRes[maxEntRes$species==sp,var]<-spMeans[spMeans$spvars==var,"critVal"]
          maxEntRes[maxEntRes$species==sp,paste(var,"_rank",sep="")]<-spMeans[spMeans$spvars==var,"rank"]
        }
        
        #write .csv file of MaxEnt results for all species (write backup version in every iteration, 
        #but only last 'full' table will exist at end of loop):

        write.table(x=maxEntRes, file=paste(FullSDMDir, "/",projectName,"_AllSppRes.csv",sep=""),   
                    na = "NA", row.names = FALSE, col.names = TRUE, sep=",")   
 
        ####################################################################
        ## prep MaxEnt model for this species with only these variables ####
        ####################################################################
        
 
        ##########################
        if (sp %in% gsub(" ","_",mapsppB)){
          ##########################
          

        vars<-spMeansRed$spvars #get subset of variables included in reduced species summary of good predictors
        
        ###################################################################
        #make species swd file (by subsetting original full swd file): ####
        Sys.sleep(0.1)
        print(paste(Sys.time(), "making swd for", sp, crit, "- final vars:", sep=" "))
        print(paste(vars))
        spswd<-read.table(paste(SWDDir,"/ModUnit_",sp,".csv",sep=""), sep=",", header=TRUE)   #read in background file
        spswd<-spswd[,c("species","longitude","latitude",paste(vars))]
        write.table(x=spswd, file=paste(SWDDir, "/ModUnit_", sp,"_",crit,"_final.csv",sep=""),   
                    na = "NA", row.names = FALSE, col.names = TRUE, sep=",")     #write swd file for final variables
        
        ###################################################################
        #make background swd file (by subsetting full bg swd file): #######
        Sys.sleep(0.1)
        print(paste(Sys.time(), "making background for", sp, crit, sep=" "))
        #read in appropriate original background used for full MaxEnt model
        #(only keep variable columns selected for this species):
        background.swd <- paste(MyDir, "/Maxent_SWDs/backgrounds/ModUnit_",sp,"_BG.csv",sep="")
        backgr<-read.table(paste(background.swd, sep=""), sep=",", header=TRUE)
        backgr<-backgr[,c("species","longitude","latitude",paste(vars))]
        
        write.table(x=backgr, file=paste(BackgrDir, "/ModUnit_", sp,"_",crit,"_bgSWD.csv",sep=""),   
                    na = "NA", row.names = FALSE, col.names = TRUE, sep=",")     #write .csv file of records associated with variable values (swd)
        
        Sys.sleep(0.1)
        print(paste(Sys.time(), "swd and background for", sp, crit, "done",sep=" "))
        
        
        
        ####################################################################
        ## re-run MaxEnt model for this species with only these variables ##
        ####################################################################
        
        # define swd, bg files, and sp vars:
        background.swd <- paste(BackgrDir, "/ModUnit_", sp, "_",crit,"_bgSWD.csv",sep="")
        occurrences.swd <- paste(SWDDir,"/ModUnit_",sp,"_",crit,"_final.csv",sep="")
        #backgr<-read.table(paste(background.swd), sep=",", header=TRUE)   #read in background file
        sprecs<-read.table(paste(occurrences.swd), sep=",", header=TRUE)   #read in species file
        #vars<-names(sprecs)[4:length(sprecs)]
        
        rangewidth<-max((max(na.omit(sprecs$long))-min(na.omit(sprecs$long))),(max(na.omit(sprecs$lat))-min(na.omit(sprecs$lat))))
        rangewidthM<-rangewidth*100000
        rangewidthM<-ifelse(rangewidthM>3000000,3000000,rangewidthM)
        rangewidthM<-ifelse(rangewidthM<1000000,1000000,rangewidthM)
        
        # create a folder in the output directory for this species and define as species specific output directory:
        dir.create(path=paste(OutDirCrossval, "/", crit,"/ModUnit_",sp, sep=""), showWarnings = FALSE, recursive=TRUE)
        dir.create(path=paste(OutDirFinal, "/", crit,"/ModUnit_",sp, sep=""), showWarnings = FALSE,recursive=TRUE)
        SpOutDir<-paste(OutDirCrossval, "/", crit,"/ModUnit_",sp, sep="")
        SpOutDirFin<-paste(OutDirFinal, "/", crit,"/ModUnit_",sp, sep="")
        
        #product feature decision
        
        if(rangewidthM <= 1000000 & length(sprecs[,1]) <80){
          pfs<-paste("noproduct")
        } else {
          pfs<-paste("product")
        }
        
        
        # write shell file: run final Maxent models (crossval & full) & project:
        
        # create the shell file for each species model job & submit to HPC
        shell.file.name <- paste(PbsDir, "/ModUnit_", sp,"_",crit, "_final.sh", sep="")
        shell.file <- file(shell.file.name, "w")
        
        ################################################
        ################################################
           
        #complusory bits for PBS
        cat('#!/bin/bash\n', file=shell.file)
        cat('#PBS -j oe\n', file=shell.file)                                  # combine stdout and stderr into one file
        cat('#PBS -m ae\n', file=shell.file)                                  # combine stdout and stderr into one file
        cat('#PBS -l walltime=50:00:00\n', sep='', file=shell.file)          # define walltime hh:mm:ss !!!!!!!!!!!!!!!!!!
        cat('#PBS -l select=1:ncpus=1:mem=70gb\n', sep='', file=shell.file)   # allocate memory !!!!!!!!!!!!!!!!!
        
        cat('cd $PBS_O_WORKDIR\n', file=shell.file)
        cat('shopt -s expand_aliases\n', file=shell.file)
        cat('source /etc/profile.d/modules.sh\n', file=shell.file)            # need to access modules for java
        cat('echo "Job identifier is $PBS_JOBID"\n', file=shell.file)         # need this to run R
        cat('echo "Working directory is $PBS_O_WORKDIR"\n', file=shell.file)  # need this to run R
        cat('module load java\n', file=shell.file)                            # need to load java for maxent
        
        # run maxent crossvaidation (70% subsamples) for stats:
        cat('java -mx',paste(70*1000),'m -XX:-UseGCOverheadLimit -jar ',maxent.jar, ' -s ',occurrences.swd, ' -e ',background.swd, ' -o ', SpOutDir,   ' -t ',catVars,' nowarnings novisible replicatetype=subsample randomtestpoints=30 replicates=',crossvals,' nooutputgrids noextrapolate dontextrapolate nothreshold allowpartialdata ',pfs,' -P -J -r -a \n', sep='', file=shell.file)

        # run full maxent model:
        cat('java -mx',paste(70*1000),'m -XX:-UseGCOverheadLimit -jar ',maxent.jar, ' -s ',occurrences.swd, ' -e ',background.swd, ' -o ',SpOutDirFin, ' -t ',catVars,' nowarnings novisible nowriteclampgrid noextrapolate dontextrapolate nothreshold writemess nothreshold allowpartialdata ',pfs,' -P -J -r -a \n', sep="", file=shell.file) 

        # project maxent model:
        outfilename <- paste(SpOutDirFin, "/ModUnit_", sp, "_current", sep="")
        cat('java -cp ',maxent.jar, ' density.Project ',SpOutDirFin, '/',sp, '.lambdas ',LayersDir, ' ', outfilename, '.asc nowriteclampgrid noextrapolate dontextrapolate \n', sep="", file=shell.file)


        #create limiting factor map:
        limitfilename <- paste(SpOutDirFin, "/ModUnit_", sp, "_limitFactor", sep="")
        cat('java -cp ',maxent.jar,' density.tools.LimitingFactor ' , SpOutDirFin, '/', sp, '.lambdas ', LayersDir, ' ', limitfilename, '.asc', sep="",file=shell.file)
        
        # zip output ascis:
        #cat('gzip -f ', outfilename, '.asc\n', sep="", file=shell.file)
        #cat('gzip -f ', limitfilename, '.asc\n', sep="", file=shell.file)
        
        #close shellfile
        close(shell.file)
        
        write(paste("qsub ", shell.file.name, sep=""), file=qsub.list.file, append=T) 
        
        #shell.file.name <- paste(PbsDir, "/ModUnit_", sp,"_",crit, "_final.sh", sep="")
        #system(paste("qsub -p 100 ", shell.file.name, sep=""))
        #Sys.sleep(5)
        
      }
    }
    
  ###########  
  }
  ###########
  
}}

#cd /home/jc217070/Maxent/tmp.pbs
#bash /home/jc217070/Maxent/R_scripts/qsubs-4-VAP_redo.txt


################################################################################################
################################################################################################
################################################################################################
################################################################################################
################################################################################################

############################################
############################################
# next: 5-VAP_CurrentPlotsThresh ###########
############################################




